cpuidle: do not enter deep C state if there is urgent VCPU
authorKeir Fraser <keir.fraser@citrix.com>
Tue, 16 Feb 2010 09:27:45 +0000 (09:27 +0000)
committerKeir Fraser <keir.fraser@citrix.com>
Tue, 16 Feb 2010 09:27:45 +0000 (09:27 +0000)
When a VCPU is polling on an event channel, it usually has an urgent task
running (e.g. waiting on a spin_lock); in this case it is better for the
cpuidle driver not to enter a deep C state.

This patch fixes an issue where SLES 11 SP1 domain0 hangs on machines with
a large number of CPUs (>= 64 CPUs).

Signed-off-by: Yu Ke <ke.yu@intel.com>
Signed-off-by: Tian Kevin <kevin.tian@intel.com>
Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
xen/arch/x86/acpi/cpu_idle.c
xen/common/sched_credit.c
xen/common/schedule.c
xen/include/xen/sched-if.h
xen/include/xen/sched.h

index 45030ba40200db145b0ec8d0461dd2aa4b21974d..f06580fbb4798afff5714ffc14abcaae16667e3d 100644 (file)
@@ -41,6 +41,7 @@
 #include <xen/keyhandler.h>
 #include <xen/cpuidle.h>
 #include <xen/trace.h>
+#include <xen/sched-if.h>
 #include <asm/cache.h>
 #include <asm/io.h>
 #include <asm/hpet.h>
@@ -216,6 +217,15 @@ static inline void trace_exit_reason(u32 *irq_traced)
     }
 }
 
+/* vcpu is urgent if vcpu is polling event channel
+ *
+ * if urgent vcpu exists, CPU should not enter deep C state
+ */
+static int sched_has_urgent_vcpu(void)
+{
+    return atomic_read(&this_cpu(schedule_data).urgent_count);
+}
+
 static void acpi_processor_idle(void)
 {
     struct acpi_processor_power *power = processor_powers[smp_processor_id()];
@@ -226,27 +236,7 @@ static void acpi_processor_idle(void)
     u32 exp = 0, pred = 0;
     u32 irq_traced[4] = { 0 };
 
-    cpufreq_dbs_timer_suspend();
-
-    sched_tick_suspend();
-    /* sched_tick_suspend() can raise TIMER_SOFTIRQ. Process it now. */
-    process_pending_softirqs();
-
-    /*
-     * Interrupts must be disabled during bus mastering calculations and
-     * for C2/C3 transitions.
-     */
-    local_irq_disable();
-
-    if ( softirq_pending(smp_processor_id()) )
-    {
-        local_irq_enable();
-        sched_tick_resume();
-        cpufreq_dbs_timer_resume();
-        return;
-    }
-
-    if ( max_cstate > 0 && power && 
+    if ( max_cstate > 0 && power && !sched_has_urgent_vcpu() &&
          (next_state = cpuidle_current_governor->select(power)) > 0 )
     {
         cx = &power->states[next_state];
@@ -263,6 +253,24 @@ static void acpi_processor_idle(void)
             pm_idle_save();
         else
             acpi_safe_halt();
+        return;
+    }
+
+    cpufreq_dbs_timer_suspend();
+
+    sched_tick_suspend();
+    /* sched_tick_suspend() can raise TIMER_SOFTIRQ. Process it now. */
+    process_pending_softirqs();
+
+    /*
+     * Interrupts must be disabled during bus mastering calculations and
+     * for C2/C3 transitions.
+     */
+    local_irq_disable();
+
+    if ( softirq_pending(smp_processor_id()) )
+    {
+        local_irq_enable();
         sched_tick_resume();
         cpufreq_dbs_timer_resume();
         return;
index b0ccb0ccacb3376de550e464a9bab00d02a5897d..914022ebb92fcf85c7ae3c1d5e8148df1d4ebf10 100644 (file)
@@ -1060,6 +1060,7 @@ csched_runq_steal(int peer_cpu, int cpu, int pri)
                 /* We got a candidate. Grab it! */
                 CSCHED_VCPU_STAT_CRANK(speer, migrate_q);
                 CSCHED_STAT_CRANK(migrate_queued);
+                BUG_ON(vc->is_urgent);
                 __runq_remove(speer);
                 vc->processor = cpu;
                 return speer;
index 3b4be55d79ba2f8a841ba68a97cb039259492736..d02eb1f0af8739a9d8465b752c87c9f0c1705cb4 100644 (file)
@@ -100,6 +100,29 @@ static inline void trace_continue_running(struct vcpu *v)
                 (unsigned char *)&d);
 }
 
+static inline void vcpu_urgent_count_update(struct vcpu *v)
+{
+    if ( is_idle_vcpu(v) )
+        return;
+
+    if ( unlikely(v->is_urgent) )
+    {
+        if ( !test_bit(v->vcpu_id, v->domain->poll_mask) )
+        {
+            v->is_urgent = 0;
+            atomic_dec(&per_cpu(schedule_data,v->processor).urgent_count);
+        }
+    }
+    else
+    {
+        if ( unlikely(test_bit(v->vcpu_id, v->domain->poll_mask)) )
+        {
+            v->is_urgent = 1;
+            atomic_inc(&per_cpu(schedule_data,v->processor).urgent_count);
+        }
+    }
+}
+
 static inline void vcpu_runstate_change(
     struct vcpu *v, int new_state, s_time_t new_entry_time)
 {
@@ -108,6 +131,8 @@ static inline void vcpu_runstate_change(
     ASSERT(v->runstate.state != new_state);
     ASSERT(spin_is_locked(&per_cpu(schedule_data,v->processor).schedule_lock));
 
+    vcpu_urgent_count_update(v);
+
     trace_runstate_change(v, new_state);
 
     delta = new_entry_time - v->runstate.state_entry_time;
@@ -188,6 +213,8 @@ void sched_destroy_vcpu(struct vcpu *v)
     kill_timer(&v->periodic_timer);
     kill_timer(&v->singleshot_timer);
     kill_timer(&v->poll_timer);
+    if ( test_and_clear_bool(v->is_urgent) )
+        atomic_dec(&per_cpu(schedule_data, v->processor).urgent_count);
     SCHED_OP(destroy_vcpu, v);
 }
 
@@ -277,7 +304,7 @@ void vcpu_unblock(struct vcpu *v)
 static void vcpu_migrate(struct vcpu *v)
 {
     unsigned long flags;
-    int old_cpu;
+    int old_cpu, new_cpu;
 
     vcpu_schedule_lock_irqsave(v, flags);
 
@@ -293,9 +320,23 @@ static void vcpu_migrate(struct vcpu *v)
         return;
     }
 
-    /* Switch to new CPU, then unlock old CPU. */
+    /* Select new CPU. */
     old_cpu = v->processor;
-    v->processor = SCHED_OP(pick_cpu, v);
+    new_cpu = SCHED_OP(pick_cpu, v);
+
+    /*
+     * Transfer urgency status to new CPU before switching CPUs, as once
+     * the switch occurs, v->is_urgent is no longer protected by the per-CPU
+     * scheduler lock we are holding.
+     */
+    if ( unlikely(v->is_urgent) && (old_cpu != new_cpu) )
+    {
+        atomic_inc(&per_cpu(schedule_data, new_cpu).urgent_count);
+        atomic_dec(&per_cpu(schedule_data, old_cpu).urgent_count);
+    }
+
+    /* Switch to new CPU, then unlock old CPU. */
+    v->processor = new_cpu;
     spin_unlock_irqrestore(
         &per_cpu(schedule_data, old_cpu).schedule_lock, flags);
 
index 5caf8245d2079d98ec277ac22d2e4438e70f0700..ed0575773063a40f2c00d4bf66e279424b7b8196 100644 (file)
@@ -16,6 +16,7 @@ struct schedule_data {
     struct vcpu        *idle;           /* idle task for this cpu          */
     void               *sched_priv;
     struct timer        s_timer;        /* scheduling timer                */
+    atomic_t            urgent_count;   /* how many urgent vcpus           */
 } __cacheline_aligned;
 
 DECLARE_PER_CPU(struct schedule_data, schedule_data);
index d9180773f94664656639913a6afede4db3984faa..2b2eca3bb8a1aa566095c01361fcc6b084ab7915 100644 (file)
@@ -115,6 +115,8 @@ struct vcpu
     bool_t           is_initialised;
     /* Currently running on a CPU? */
     bool_t           is_running;
+    /* VCPU should wake fast (do not deep sleep the CPU). */
+    bool_t           is_urgent;
 
 #ifdef VCPU_TRAP_LAST
 #define VCPU_TRAP_NONE    0